ADD PARAGRAPH HERE
ADD PARAGRAPH AND ALL DATA SOURCES HERE
#cleanign data
marketing_campaign <- read_delim("marketing_campaign.csv",
"\t", escape_double = FALSE, trim_ws = TRUE)
## Rows: 2240 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): Education, Marital_Status, Dt_Customer
## dbl (26): ID, Year_Birth, Income, Kidhome, Teenhome, Recency, MntWines, MntF...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(marketing_campaign)
glimpse(marketing_campaign)
## Rows: 2,240
## Columns: 29
## $ ID <dbl> 5524, 2174, 4141, 6182, 5324, 7446, 965, 6177, 485…
## $ Year_Birth <dbl> 1957, 1954, 1965, 1984, 1981, 1967, 1971, 1985, 19…
## $ Education <chr> "Graduation", "Graduation", "Graduation", "Graduat…
## $ Marital_Status <chr> "Single", "Single", "Together", "Together", "Marri…
## $ Income <dbl> 58138, 46344, 71613, 26646, 58293, 62513, 55635, 3…
## $ Kidhome <dbl> 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,…
## $ Teenhome <dbl> 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,…
## $ Dt_Customer <chr> "04-09-2012", "08-03-2014", "21-08-2013", "10-02-2…
## $ Recency <dbl> 58, 38, 26, 26, 94, 16, 34, 32, 19, 68, 11, 59, 82…
## $ MntWines <dbl> 635, 11, 426, 11, 173, 520, 235, 76, 14, 28, 5, 6,…
## $ MntFruits <dbl> 88, 1, 49, 4, 43, 42, 65, 10, 0, 0, 5, 16, 61, 2, …
## $ MntMeatProducts <dbl> 546, 6, 127, 20, 118, 98, 164, 56, 24, 6, 6, 11, 4…
## $ MntFishProducts <dbl> 172, 2, 111, 10, 46, 0, 50, 3, 3, 1, 0, 11, 225, 3…
## $ MntSweetProducts <dbl> 88, 1, 21, 3, 27, 42, 49, 1, 3, 1, 2, 1, 112, 5, 1…
## $ MntGoldProds <dbl> 88, 6, 42, 5, 15, 14, 27, 23, 2, 13, 1, 16, 30, 14…
## $ NumDealsPurchases <dbl> 3, 2, 1, 2, 5, 2, 4, 2, 1, 1, 1, 1, 1, 3, 1, 1, 3,…
## $ NumWebPurchases <dbl> 8, 1, 8, 2, 5, 6, 7, 4, 3, 1, 1, 2, 3, 6, 1, 7, 3,…
## $ NumCatalogPurchases <dbl> 10, 1, 2, 0, 3, 4, 3, 0, 0, 0, 0, 0, 4, 1, 0, 6, 0…
## $ NumStorePurchases <dbl> 4, 2, 10, 4, 6, 10, 7, 4, 2, 0, 2, 3, 8, 5, 3, 12,…
## $ NumWebVisitsMonth <dbl> 7, 5, 4, 6, 5, 6, 6, 8, 9, 20, 7, 8, 2, 6, 8, 3, 8…
## $ AcceptedCmp3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,…
## $ AcceptedCmp4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ AcceptedCmp5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ AcceptedCmp1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ AcceptedCmp2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Complain <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Z_CostContact <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,…
## $ Z_Revenue <dbl> 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11…
## $ Response <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,…
skim(marketing_campaign)
| Name | marketing_campaign |
| Number of rows | 2240 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Education | 0 | 1 | 3 | 10 | 0 | 5 | 0 |
| Marital_Status | 0 | 1 | 4 | 8 | 0 | 8 | 0 |
| Dt_Customer | 0 | 1 | 10 | 10 | 0 | 663 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ID | 0 | 1.00 | 5592.16 | 3246.66 | 0 | 2828.25 | 5458.5 | 8427.75 | 11191 | ▇▇▇▇▇ |
| Year_Birth | 0 | 1.00 | 1968.81 | 11.98 | 1893 | 1959.00 | 1970.0 | 1977.00 | 1996 | ▁▁▂▇▅ |
| Income | 24 | 0.99 | 52247.25 | 25173.08 | 1730 | 35303.00 | 51381.5 | 68522.00 | 666666 | ▇▁▁▁▁ |
| Kidhome | 0 | 1.00 | 0.44 | 0.54 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▆▁▁ |
| Teenhome | 0 | 1.00 | 0.51 | 0.54 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▇▁▁ |
| Recency | 0 | 1.00 | 49.11 | 28.96 | 0 | 24.00 | 49.0 | 74.00 | 99 | ▇▇▇▇▇ |
| MntWines | 0 | 1.00 | 303.94 | 336.60 | 0 | 23.75 | 173.5 | 504.25 | 1493 | ▇▂▂▁▁ |
| MntFruits | 0 | 1.00 | 26.30 | 39.77 | 0 | 1.00 | 8.0 | 33.00 | 199 | ▇▁▁▁▁ |
| MntMeatProducts | 0 | 1.00 | 166.95 | 225.72 | 0 | 16.00 | 67.0 | 232.00 | 1725 | ▇▁▁▁▁ |
| MntFishProducts | 0 | 1.00 | 37.53 | 54.63 | 0 | 3.00 | 12.0 | 50.00 | 259 | ▇▁▁▁▁ |
| MntSweetProducts | 0 | 1.00 | 27.06 | 41.28 | 0 | 1.00 | 8.0 | 33.00 | 263 | ▇▁▁▁▁ |
| MntGoldProds | 0 | 1.00 | 44.02 | 52.17 | 0 | 9.00 | 24.0 | 56.00 | 362 | ▇▁▁▁▁ |
| NumDealsPurchases | 0 | 1.00 | 2.33 | 1.93 | 0 | 1.00 | 2.0 | 3.00 | 15 | ▇▂▁▁▁ |
| NumWebPurchases | 0 | 1.00 | 4.08 | 2.78 | 0 | 2.00 | 4.0 | 6.00 | 27 | ▇▃▁▁▁ |
| NumCatalogPurchases | 0 | 1.00 | 2.66 | 2.92 | 0 | 0.00 | 2.0 | 4.00 | 28 | ▇▂▁▁▁ |
| NumStorePurchases | 0 | 1.00 | 5.79 | 3.25 | 0 | 3.00 | 5.0 | 8.00 | 13 | ▂▇▂▃▂ |
| NumWebVisitsMonth | 0 | 1.00 | 5.32 | 2.43 | 0 | 3.00 | 6.0 | 7.00 | 20 | ▅▇▁▁▁ |
| AcceptedCmp3 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| AcceptedCmp4 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| AcceptedCmp5 | 0 | 1.00 | 0.07 | 0.26 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| AcceptedCmp1 | 0 | 1.00 | 0.06 | 0.25 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| AcceptedCmp2 | 0 | 1.00 | 0.01 | 0.11 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| Complain | 0 | 1.00 | 0.01 | 0.10 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▁ |
| Z_CostContact | 0 | 1.00 | 3.00 | 0.00 | 3 | 3.00 | 3.0 | 3.00 | 3 | ▁▁▇▁▁ |
| Z_Revenue | 0 | 1.00 | 11.00 | 0.00 | 11 | 11.00 | 11.0 | 11.00 | 11 | ▁▁▇▁▁ |
| Response | 0 | 1.00 | 0.15 | 0.36 | 0 | 0.00 | 0.0 | 0.00 | 1 | ▇▁▁▁▂ |
#we see that income column has 24 NA values, we can remove those rows
#we also notice that a few year of births are impractically old, we remove all years before 1925
profile_clean <- marketing_campaign %>%
janitor::clean_names() %>%
drop_na(income) %>%
filter(year_birth > 1925)
skimr::skim(profile_clean)
| Name | profile_clean |
| Number of rows | 2213 |
| Number of columns | 29 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| education | 0 | 1 | 3 | 10 | 0 | 5 | 0 |
| marital_status | 0 | 1 | 4 | 8 | 0 | 8 | 0 |
| dt_customer | 0 | 1 | 10 | 10 | 0 | 662 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 5586.90 | 3247.82 | 0 | 2815 | 5455 | 8420 | 11191 | ▇▇▇▇▇ |
| year_birth | 0 | 1 | 1968.92 | 11.70 | 1940 | 1959 | 1970 | 1977 | 1996 | ▂▆▇▇▂ |
| income | 0 | 1 | 52236.58 | 25178.60 | 1730 | 35246 | 51373 | 68487 | 666666 | ▇▁▁▁▁ |
| kidhome | 0 | 1 | 0.44 | 0.54 | 0 | 0 | 0 | 1 | 2 | ▇▁▆▁▁ |
| teenhome | 0 | 1 | 0.51 | 0.54 | 0 | 0 | 0 | 1 | 2 | ▇▁▇▁▁ |
| recency | 0 | 1 | 49.01 | 28.94 | 0 | 24 | 49 | 74 | 99 | ▇▇▇▇▇ |
| mnt_wines | 0 | 1 | 305.15 | 337.31 | 0 | 24 | 175 | 505 | 1493 | ▇▂▂▁▁ |
| mnt_fruits | 0 | 1 | 26.32 | 39.74 | 0 | 2 | 8 | 33 | 199 | ▇▁▁▁▁ |
| mnt_meat_products | 0 | 1 | 166.96 | 224.23 | 0 | 16 | 68 | 232 | 1725 | ▇▁▁▁▁ |
| mnt_fish_products | 0 | 1 | 37.64 | 54.76 | 0 | 3 | 12 | 50 | 259 | ▇▁▁▁▁ |
| mnt_sweet_products | 0 | 1 | 27.03 | 41.09 | 0 | 1 | 8 | 33 | 262 | ▇▁▁▁▁ |
| mnt_gold_prods | 0 | 1 | 43.91 | 51.70 | 0 | 9 | 24 | 56 | 321 | ▇▂▁▁▁ |
| num_deals_purchases | 0 | 1 | 2.33 | 1.92 | 0 | 1 | 2 | 3 | 15 | ▇▂▁▁▁ |
| num_web_purchases | 0 | 1 | 4.09 | 2.74 | 0 | 2 | 4 | 6 | 27 | ▇▃▁▁▁ |
| num_catalog_purchases | 0 | 1 | 2.67 | 2.93 | 0 | 0 | 2 | 4 | 28 | ▇▂▁▁▁ |
| num_store_purchases | 0 | 1 | 5.81 | 3.25 | 0 | 3 | 5 | 8 | 13 | ▂▇▂▃▂ |
| num_web_visits_month | 0 | 1 | 5.32 | 2.43 | 0 | 3 | 6 | 7 | 20 | ▅▇▁▁▁ |
| accepted_cmp3 | 0 | 1 | 0.07 | 0.26 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| accepted_cmp4 | 0 | 1 | 0.07 | 0.26 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| accepted_cmp5 | 0 | 1 | 0.07 | 0.26 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| accepted_cmp1 | 0 | 1 | 0.06 | 0.25 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| accepted_cmp2 | 0 | 1 | 0.01 | 0.12 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| complain | 0 | 1 | 0.01 | 0.09 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| z_cost_contact | 0 | 1 | 3.00 | 0.00 | 3 | 3 | 3 | 3 | 3 | ▁▁▇▁▁ |
| z_revenue | 0 | 1 | 11.00 | 0.00 | 11 | 11 | 11 | 11 | 11 | ▁▁▇▁▁ |
| response | 0 | 1 | 0.15 | 0.36 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▂ |
#Clusters
PLS SEE AT BOTTOM AGAIN
#combining some columns into 1
df1=na.omit(profile_clean)
df1['Age']= 2021-df1$year_birth
df1['Child']=df1$kidhome+df1$teenhome
df1['total_spent']=df1$mnt_meat_products+df1$mnt_fish_products+df1$mnt_wines+df1$mnt_fruits+df1$mnt_sweet_products+df1$mnt_gold_prods
df1['accepted']=df1$accepted_cmp1+df1$accepted_cmp2+df1$accepted_cmp3+df1$accepted_cmp4+df1$accepted_cmp5
head(df1)
| id | year_birth | education | marital_status | income | kidhome | teenhome | dt_customer | recency | mnt_wines | mnt_fruits | mnt_meat_products | mnt_fish_products | mnt_sweet_products | mnt_gold_prods | num_deals_purchases | num_web_purchases | num_catalog_purchases | num_store_purchases | num_web_visits_month | accepted_cmp3 | accepted_cmp4 | accepted_cmp5 | accepted_cmp1 | accepted_cmp2 | complain | z_cost_contact | z_revenue | response | Age | Child | total_spent | accepted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5.52e+03 | 1.96e+03 | Graduation | Single | 5.81e+04 | 0 | 0 | 04-09-2012 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 64 | 0 | 1.62e+03 | 0 |
| 2.17e+03 | 1.95e+03 | Graduation | Single | 4.63e+04 | 1 | 1 | 08-03-2014 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 67 | 2 | 27 | 0 |
| 4.14e+03 | 1.96e+03 | Graduation | Together | 7.16e+04 | 0 | 0 | 21-08-2013 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 1 | 8 | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 56 | 0 | 776 | 0 |
| 6.18e+03 | 1.98e+03 | Graduation | Together | 2.66e+04 | 1 | 0 | 10-02-2014 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 2 | 2 | 0 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 37 | 1 | 53 | 0 |
| 5.32e+03 | 1.98e+03 | PhD | Married | 5.83e+04 | 1 | 0 | 19-01-2014 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 5 | 5 | 3 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 40 | 1 | 422 | 0 |
| 7.45e+03 | 1.97e+03 | Master | Together | 6.25e+04 | 0 | 1 | 09-09-2013 | 16 | 520 | 42 | 98 | 0 | 42 | 14 | 2 | 6 | 4 | 10 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 54 | 1 | 716 | 0 |
#getting rid of other columns
df1=df1[c(-1,-2,-6,-7,-8,-10,-11,-12,-13,-14,-15,-21,-22,-23,-24,-25,-27,-28)]
library(caret)
dmy <- dummyVars(" ~ .", data = df1, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = df1))
#glimpse(dat_transformed)
dfc=dat_transformed[c(7,8,13,14,15,16,17,18,19,20,21,22,23,24)]
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_nbclust(dfc,kmeans,method="wss")+geom_vline(xintercept=2,linetype=2)
set.seed(123)
km.res <- kmeans(dfc,2, nstart = 10)
fviz_cluster(km.res, dfc, geom = "point",ellipse.type = "norm",repel = TRUE, ggtheme = theme_minimal(), xlab="", ylab="")+
ggtitle(label="We identified 2 main consumer clusters")+
scale_color_manual(values=c("tomato", "cornflowerblue"))+
theme(plot.title = element_text(face = "bold"))
df1<-df1 %>%
select(-education, -marital_status)
df1 <- data.frame(scale(df1))
model_kmeans_2clusters <- eclust(df1, "kmeans", k = 2, nstart=50, graph=FALSE)
#add clusters to the data frame
dfc_withClusters <- mutate(df1, cluster=as.factor(model_kmeans_2clusters$cluster))
cluster_centers <- data.frame(cluster=as.factor(c(1:2)),model_kmeans_2clusters$centers)
#transpose
cluster_centers_t <- cluster_centers %>% gather(variable, value, -cluster, factor_key=TRUE)
#plot the clusters
graphkmeans_2clusters <- ggplot(cluster_centers_t, aes(x=variable, y=value))+
geom_line(aes(color=cluster, group=cluster), linetype="dashed", size=1)+
geom_point(size=1, shape=4)+
geom_hline(yintercept=0)+
theme_minimal()+
theme(text = element_text(size=10),
axis.text.x = element_text(angle=45, hjust=1),
plot.title = element_text(face = "bold"))+
ggtitle("Consumers are separated on income, amount spent and \nnumber of children", subtitle="Separation of 2 clusters based on different consumer features") +
scale_color_manual(values=c("tomato", "cornflowerblue"))+
labs(y="", x="")
graphkmeans_2clusters
loadfonts(device="pdf")
custom_layout <- theme_minimal() + theme(
text=element_text(family = "PT Sans"),
plot.title = element_text(face="bold", size = rel(1.7)),
plot.subtitle= element_text(color="grey60", size = rel(1.6)),
panel.background = element_blank(),
plot.caption = element_text(hjust = 0, size = rel(1.2), face="italic", color="grey60"),
plot.title.position = "plot",
plot.caption.position = "plot",
axis.title = element_text(color="grey50", size = rel(1.4)),
axis.ticks= element_line(color="grey50", size = rel(1.4)),
axis.line = element_line(size = 1, colour = "grey80"),
axis.text =element_text(color="grey50", size = rel(1)),
legend.title = element_text(family = "PT Sans",face = "bold", rel(1.4)))
SOPHIA’S PART –> WILL CLEAN UP AND COMMENT MORE
Before we can start with the analysis, we load and clean the data.
# Data Source: https://apps.bea.gov/regional/downloadzip.cfm
gdp <- read_csv(here::here("data", "SAGDP1__ALL_AREAS_1997_2020.csv"))
pce <- read_csv(here::here("data", "SAPCE4__ALL_AREAS_1997_2020.csv"))
# PUT LINKS HERE
states <- read_csv(here::here("data", "states.csv"))
state_pop <- read_csv(here::here("data", "2019_Census_US_Population_Data_By_State_Lat_Long.csv"))
#GDP reported in million
glimpse(gdp)
## Rows: 484
## Columns: 32
## Warning: One or more parsing issues, see `problems()` for details
## $ GeoFIPS <chr> "00000", "00000", "00000", "00000", "00000", "0…
## $ GeoName <chr> "United States", "United States", "United State…
## $ Region <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 5, 5, 5, 5, 5, …
## $ TableName <chr> "SAGDP1", "SAGDP1", "SAGDP1", "SAGDP1", "SAGDP1…
## $ LineCode <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8,…
## $ IndustryClassification <chr> "...", "...", "...", "...", "...", "...", "..."…
## $ Description <chr> "Real GDP (millions of chained 2012 dollars)", …
## $ Unit <chr> "Millions of chained 2012 dollars", "Quantity i…
## $ `1997` <dbl> 11529157.000, 70.931, 8577552.000, 4713220.000,…
## $ `1998` <dbl> 12045824.000, 74.110, 9062817.000, 5075701.000,…
## $ `1999` <dbl> 12623361.000, 77.663, 9631172.000, 5409937.000,…
## $ `2000` <dbl> 13138035.000, 80.830, 10250952.000, 5854634.000…
## $ `2001` <dbl> 13263417.000, 81.601, 10581929.000, 6046346.000…
## $ `2002` <dbl> 13488357.000, 82.985, 10929108.000, 6143370.000…
## $ `2003` <dbl> 13865519.000, 85.305, 11456450.000, 6362298.000…
## $ `2004` <dbl> 14399696.000, 88.592, 12217196.000, 6729306.000…
## $ `2005` <dbl> 14901269.000, 91.678, 13039197.000, 7077722.000…
## $ `2006` <dbl> 15315943.000, 94.229, 13815583.000, 7491260.000…
## $ `2007` <dbl> 15623871.000, 96.123, 14474228.000, 7889371.000…
## $ `2008` <dbl> 15642962.000, 96.241, 14769862.000, 8068682.000…
## $ `2009` <dbl> 15236262.000, 93.739, 14478067.000, 7767191.000…
## $ `2010` <dbl> 15648991.000, 96.278, 15048970.000, 7932970.000…
## $ `2011` <dbl> 15891534.000, 97.770, 15599731.000, 8234017.000…
## $ `2012` <dbl> 16253970.0, 100.0, 16253970.0, 8575362.0, 66005…
## $ `2013` <dbl> 16553348.000, 101.842, 16843196.000, 8843637.00…
## $ `2014` <dbl> 16932051.000, 104.172, 17550687.000, 9259654.00…
## $ `2015` <dbl> 17390295.000, 106.991, 18206023.000, 9709535.00…
## $ `2016` <dbl> 17680274.000, 108.775, 18695106.000, 9977096.00…
## $ `2017` <dbl> 18079084.000, 111.229, 19479623.000, 10436745.0…
## $ `2018` <dbl> 18606787.000, 114.475, 20527159.000, 10969807.0…
## $ `2019` <dbl> 19032672.000, 117.096, 21372582.000, 11459449.0…
## $ `2020` <dbl> 18384687.000, 113.109, 20893746.000, 11580088.0…
glimpse(pce)
## Rows: 7,985
## Columns: 32
## Warning: One or more parsing issues, see `problems()` for details
## $ GeoFIPS <chr> "00000", "00000", "00000", "00000", "00000", "0…
## $ GeoName <chr> "United States", "United States", "United State…
## $ Region <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
## $ TableName <chr> "SAPCE4", "SAPCE4", "SAPCE4", "SAPCE4", "SAPCE4…
## $ LineCode <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ IndustryClassification <chr> "...", "...", "...", "...", "...", "...", "..."…
## $ Description <chr> "Personal consumption expenditures", "Household…
## $ Unit <chr> "Millions of current dollars", "Millions of cur…
## $ `1997` <dbl> 5536790, 5431202, 474777, 412764, 61598, 414, 2…
## $ `1998` <dbl> 5877248, 5753441, 487437, 422087, 64968, 382, 2…
## $ `1999` <dbl> 6283758, 6145529, 515530, 444945, 70235, 350, 2…
## $ `2000` <dbl> 6767179, 6609139, 540578, 463125, 77136, 318, 2…
## $ `2001` <dbl> 7073801, 6894718, 564003, 482228, 81489, 286, 2…
## $ `2002` <dbl> 7348941, 7150598, 575052, 490397, 84374, 281, 2…
## $ `2003` <dbl> 7740749, 7535238, 599580, 513578, 85728, 275, 3…
## $ `2004` <dbl> 8231960, 8025589, 632604, 542995, 89269, 340, 3…
## $ `2005` <dbl> 8769066, 8558799, 668216, 575282, 92551, 384, 3…
## $ `2006` <dbl> 9277236, 9038051, 700260, 601564, 98231, 465, 3…
## $ `2007` <dbl> 9746594, 9497813, 737332, 634713, 102215, 403, …
## $ `2008` <dbl> 10050083, 9762791, 769085, 665819, 102877, 390,…
## $ `2009` <dbl> 9891218, 9602038, 772930, 669151, 103434, 345, …
## $ `2010` <dbl> 10260256, 9965818, 786866, 678586, 107895, 386,…
## $ `2011` <dbl> 10698857, 10387001, 819542, 709051, 110123, 368…
## $ `2012` <dbl> 11047363, 10705817, 846198, 731642, 114172, 384…
## $ `2013` <dbl> 11363528, 11010938, 863994, 748277, 115227, 489…
## $ `2014` <dbl> 11847725, 11482129, 896855, 776231, 120037, 588…
## $ `2015` <dbl> 12263476, 11891892, 920955, 794884, 125449, 622…
## $ `2016` <dbl> 12693266, 12291833, 940635, 809680, 130457, 498…
## $ `2017` <dbl> 13239111, 12821163, 973072, 837665, 134975, 433…
## $ `2018` <dbl> 13913531, 13468865, 1000345, 859057, 140852, 43…
## $ `2019` <dbl> 14428676, 13988786, 1030911, 884334, 146090, 48…
## $ `2020` <chr> "14047565", "13526689", "1146676", "979256", "1…
glimpse(states)
## Rows: 51
## Columns: 3
## $ state <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "C…
## $ latitude <dbl> 32.7794, 64.0685, 34.2744, 34.8938, 37.1841, 38.9972, 41.621…
## $ longitude <dbl> -86.8287, -152.2782, -111.6602, -92.4426, -119.4696, -105.54…
glimpse(state_pop)
## Rows: 51
## Columns: 4
## $ STATE <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "Californi…
## $ POPESTIMATE2019 <dbl> 4903185, 731545, 7278717, 3017804, 39512223, 5758736, …
## $ lat <dbl> 32.37772, 58.30160, 33.44814, 34.74661, 38.57667, 39.7…
## $ long <dbl> -86.30057, -134.42021, -112.09696, -92.28899, -121.493…
#Definitions of regions according to BEA
#https://apps.bea.gov/regional/docs/msalist.cfm?mlist=2
new_england <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "Rhode Island", "Vermont")
mideast <- c("Delaware", "District of Columbia", "Maryland", "New Jersey", "New York", "Pennsylvania")
great_lakes <- c("Illinois", "Indiana", "Michigan", "Ohio", "Wisconsin")
plains <- c("Iowa", "Kansas", "Minnesota", "Missouri", "Nebraska", "North Dakota", "South Dakota")
south_east <- c("Alabama", "Arkansas", "Florida", "Georgia", "Kentucky", "Louisiana", "Mississippi", "North Carolina", "South Carolina", "Tennessee", "Virginia", "West Virginia")
southwest <- c("Arizona", "New Mexico", "Oklahoma", "Texas")
rocky_mountain <- c("Colorado", "Idaho", "Montana", "Utah", "Wyoming")
far_west <- c("Alaska", "California", "Hawaii", "Nevada", "Oregon", "Washington")
gdp_clean <- gdp %>%
#Clean names
janitor::clean_names() %>%
#Filter out row type we need
filter(description=="Real GDP (millions of chained 2012 dollars)") %>%
#Pivot longer to get right data format
pivot_longer(cols = starts_with("x"),
names_to = "year",
values_to = "GDP") %>%
#Delete first letter of year
mutate(year=substring(year, 2), year=lubridate::year(as.Date(year, format="%Y")),
#Convert GDP into billion
GDP=GDP/1000)
#Divide data into state and region level
gdp_states <- gdp_clean %>% filter(!geo_name %in% c("United States", "Far West", "Rocky Mountain", "Southwest","Southeast", "Plains", "Great Lakes", "Mideast", "New England" ))
gdp_regions <- gdp_clean %>% filter(geo_name %in% c("United States", "Far West", "Rocky Mountain", "Southwest","Southeast", "Plains", "Great Lakes", "Mideast", "New England" ))
#Join gdp_states with state latitute/longlitude data
gdp_states <- gdp_states %>% left_join(states, by=c("geo_name" = "state"))
gdp_states_2020 <- gdp_states %>% filter(year==2020) %>% filter(!geo_name %in% c("Alaska", "Hawaii"))
gdp_states_sf <- st_as_sf(gdp_states_2020,
coords=c("longitude", "latitude"),
crs=4326)
gdp_states_2020_temp <- gdp_states_2020 %>% select(geo_name, GDP)
#glimpse(gdp_states_sf)
#Create dataframe with GDP per capita
gdp_perCapita_states_2020 <- gdp_states_2020 %>% left_join(state_pop, by=c("geo_name"="STATE")) %>%
select(!lat, !long) %>%
mutate(GDP_pc=(GDP/POPESTIMATE2019)*1000000000) %>% select(geo_name, GDP_pc)
#GDP per capita for regions
region_pop <- state_pop %>% select(STATE, POPESTIMATE2019) %>%
mutate(region=case_when(
STATE %in% new_england ~ "New England",
STATE %in% mideast ~ "Mideast",
STATE %in% great_lakes ~ "Great Lakes",
STATE %in% plains ~ "Plains",
STATE %in% south_east ~ "Southeast",
STATE %in% southwest ~ "Southwest",
STATE %in% rocky_mountain ~ "Rocky Mountain",
STATE %in% far_west ~ "Far West"
)) %>% group_by(region) %>% summarise(region_pop=sum(POPESTIMATE2019))
#Simplification assumption that population is constant
gdp_pc_regions <- gdp_regions %>%
filter(geo_name != "United States") %>%
left_join(region_pop, by=c("geo_name" = "region")) %>%
mutate(gdp_pc = GDP/region_pop *1000000000)
#glimpse(pce)
pce_clean <- pce %>%
#Clean names
janitor::clean_names() %>%
mutate(x2020 = as.numeric(x2020)) %>%
#Pivot longer to get right data format
pivot_longer(cols = 9:32,
names_to = "year",
values_to = "Expenditure") %>%
mutate(year=substring(year, 2), year=lubridate::year(as.Date(year, format="%Y")))
pce_total_annual <- pce_clean %>%
filter(description=="Personal consumption expenditures", geo_name =="United States") %>% group_by(year) %>%
#filter(!is.na(Expenditure)) %>%
mutate(population=(sum(region_pop$region_pop)), exp_percapita=Expenditure/population *1000000)
pce_pc_short <- pce_total_annual %>% select(year, exp_percapita)
First, we want to get a feeling of who GDP and PCE (Personal Consumption Expenditure) is developing over time in the US to get a better feeling of the economic situation in the US.
#Join GDP, PCE data
eco_ind_us <- gdp_regions %>%
filter(geo_name == "United States") %>%
mutate(population=(sum(region_pop$region_pop)), gdp_pc=GDP/population*1000000000) %>% select(year, gdp_pc) %>%
#left_join(Inc_pc_us_short, by="year") %>%
left_join(pce_pc_short, by="year") %>%
pivot_longer(cols=2:3, names_to="indicator", values_to="value")
#Plot graphs with both indicators
ggplot(eco_ind_us, aes(x=year, y=value, group=indicator, color=indicator)) +
#Create gray background for crisis
geom_rect(aes(xmin=2008,xmax=2009.5),
ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
geom_rect(aes(xmin=2019.5,xmax=2021),
ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) + geom_rect(aes(xmin=2000,xmax=2002),
ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
geom_line(size=1.5) +
scale_color_manual(values = c("#E6ADE6", "#024B7C"),
labels=c("PCE", "GDP"))+
custom_layout +
labs(title="Economic growth is slowed down during global economic crisis",
subtitle = "GDP and PCE development over time in USA",
x="Time",
y="GDP and PCE per capita in $",
caption= "Source: Bureau of Economic Analysis GDP and PCE Data 1997-2020",
color="Economic indicator") +
# add the text label on the graph
geom_text(
data = data.frame(x = 2008.75, y = 59000, label = "Financial Crisis"),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE) +
geom_text(
data = data.frame(x = 2020.2, y = 59000, label = "Covid-19"),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE)+
geom_text(
data = data.frame(x = 2001, y = 59000, label = "Dot-com bubble"),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE)
To evaluate if wine is a good industry to go into, we use the PCE data to get a first hypothesis. We calculate the propotion of alcohol spending of total consumer spending over time to get a first indication for this.
#PROPORTIONAL GROWTH OF ALCHOLIC BEVERAGES FROM TOTAL EXPENDITURE
pce_alc <- pce_clean %>% filter(description %in% c("Personal consumption expenditures","Alcoholic beverages purchased for off-premises consumption")) %>%
mutate(description = case_when(description == "Personal consumption expenditures" ~ "PCE",
description == "Alcoholic beverages purchased for off-premises consumption" ~ "Alc_beverages"))
pce_alc <- pce_alc %>% select(geo_name, year, description, Expenditure) %>%
group_by(geo_name, year) %>%
pivot_wider(names_from="description", values_from="Expenditure") %>%
mutate(Alc_pct=Alc_beverages/PCE*100)
#GRAPH ALCOHOL SPENDING AS % OF TOTAL SPENDING
pce_alc %>% filter(geo_name=="United States") %>%
ggplot(aes(x=year, y=Alc_pct)) +
geom_rect(aes(xmin=2008,xmax=2009.5),
ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
geom_rect(aes(xmin=2019.5,xmax=2021),
ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
geom_rect(aes(xmin=2000,xmax=2002),
ymin=-Inf,ymax=Inf, fill="#E5E6E8", color="#E5E6E8", alpha=0.035) +
geom_point(size=2, color="#5C0303") +
geom_line(size=1.2, color="#5C0303") +
custom_layout +
ylim(c(0, 1.3)) +
labs(title="Alcohol spending increases in times of crisis",
subtitle = "Proportion of alcohol beverage spending of total Consumption Expenditures over time in USA",
x="Time",
y="Alcoholic beverage spending (in %)",
caption= "Source: Bureau of Economic Analysis Personal Consumption Expenditures Data 1997-2020") +
# add the text label on the graph
geom_text(
data = data.frame(x = 2008.75, y = 1.25, label = "Financial Crisis"),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE) +
geom_text(
data = data.frame(x = 2020.2, y = 1.25, label = "Covid-19"),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE) +
geom_text(
data = data.frame(x = 2001, y = 1.25, label = "Dot-com bubble"),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE)
Do go a little deeper, we want to analyze GDP growth on region. It becomes visible that different regions have not only different GDP and different GDP per capita but also the average of annual growth over the last 20 years significantly differs, which should be taken into consideration when deciding where to enter the market.
# GDP over time by region
gdp_regions %>% filter(geo_name != "United States") %>%
ggplot(aes(x=year, y=GDP, group=geo_name)) +
geom_line(aes(color=geo_name)) +
custom_layout +
labs(title="Southeast has the highest GDP",
subtitle = "GDP development over time by region",
x="Time",
y="GDP in $B",
caption= "Source: Bureau of Economic Analysis GDP Data 1997-2020",
color= "Regions")
# GDP per capita over time by region
gdp_pc_regions %>%
ggplot(aes(x=year, y=gdp_pc, group=geo_name)) +
geom_line(aes(color=geo_name)) +
custom_layout +
labs(title="Far West, Mideast and New England have the highest GDP per capita",
subtitle = "GDP per capita development over time by region",
x="Time",
y="GDP per capita in $",
caption= "Source: Bureau of Economic Analysis GDP Data 1997-2020",
color= "Regions")
# GDP per capita increase from last for first for all regions
gdp_pc_regions_growth <- gdp_pc_regions %>%
filter(year %in% c(2020, 1997)) %>%
group_by(region) %>%
#pivot wider to calculate change over years
pivot_wider(names_from=year, values_from=c(gdp_pc, GDP)) %>%
select(geo_name, gdp_pc_1997, gdp_pc_2020) %>%
#calculate cagr and av. annual growth
mutate(cagr=(gdp_pc_2020/gdp_pc_1997)^(1/(2020-1997)-1), average_annual_growth=((gdp_pc_2020-gdp_pc_1997)/gdp_pc_1997)/(2020-1997)) %>%
mutate(blue=ifelse(geo_name %in% c("Southwest","Rocky Mountain", "Far West"), TRUE, FALSE))
#Plot findings from abovefct_reorder(country, beer_servings)
my_colours <- c("grey70", "#093172")
label <- "Growth mainly driven by Tech and Oil industry growth\n over the two decades"
gdp_pc_regions_growth %>% ggplot(aes(x=fct_reorder(geo_name, desc(average_annual_growth)), y=average_annual_growth, fill=blue))+
geom_col() +
scale_y_continuous(labels = scales::percent)+
scale_fill_manual(values = my_colours)+
custom_layout +
theme(
axis.title.y = element_blank(),
legend.position = "none"
)+
labs(title="The West experienced higher GDP per capita growth over the last 20 years",
subtitle = "GDP per capita development over time by region",
x="Regions",
y="Annual GDP per capita growth rate (1997-2020)",
caption= "Source: Bureau of Economic Analysis GDP Data 1997-2020")+
#add arrow to graph
geom_curve(
data = data.frame(x = 6, y = 0.03, xend = 3.7, yend = 0.038),
mapping = aes(x = x, y = y, xend = xend, yend = yend),
colour = "grey15",
size = 0.5,
curvature = 0.25,
arrow = arrow(length = unit(2, "mm"), type = "closed"),
inherit.aes = FALSE ) +
# add the text label on the graph
geom_text(
data = data.frame(x = 6, y = 0.028, label = label),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE)
Next we plot GDP and GDP per capita on a US map based on states. With this, differences between states become easily visible. Moreover, it is intresting to see how missleading GDP can be as an indication for average wealth when the population size is not considered, given that the two maps look very different from each other.
#GDP MAP OF US (total GDP --> economy size)
states <- read_sf(here::here("data", "cb_2018_us_state_500k.shp")) %>%
filter(!NAME %in% c("Guam", "Hawaii", "Commonwealth of the Northern Mariana Islands", "American Samoa", "United States Virgin Islands", "Alaska", "Puerto Rico"))
states_gdp <- states %>% left_join(gdp_states_2020_temp, by=c("NAME" = "geo_name"))
us_gdp_map <- ggplot() +
geom_sf(data=states_gdp, aes(fill= GDP)) +
custom_layout +
scale_fill_gradient(low = "#CEE9F7", high = "#0032BF") +
coord_sf(datum = NA) +
labs(title="California has the highest GDP followed by Texas",
subtitle = "GDP in million dollars per state (2020) ",
caption="Source: Bureau of Economic Analysis GDP Data 2020",
fill="GDP ($B)")
us_gdp_map
#GDP MAP OF US per CAPITA
states_gdp_pc <- states %>% left_join(gdp_perCapita_states_2020, by=c("NAME" = "geo_name")) %>%
#Filter out DC b/c it is not comparable (by far the highest GDP per capita)
filter(NAME != "District of Columbia")
us_gdp_map_pc <- ggplot() +
geom_sf(data=states_gdp_pc, aes(fill= GDP_pc)) +
custom_layout +
scale_fill_gradient(low = "#CEE9F7", high = "#0032BF") +
coord_sf(datum = NA)+
labs(title="Washington DC, New York and Massachusetts have the highest GDP per capita",
subtitle = "GDP per capita in dollars per state (2020) ",
caption="Source: Bureau of Economic Analysis GDP Data 2020, US Census 2019",
fill="GDP per capita ($)")
us_gdp_map_pc
We also want to understand if wine is a good business to go into. Hence we first look at the wine consumption development over the years. We find that wine consumption has grown significantly over the last 15 years in the US.
library("readxl")
#Source: https://www.statista.com/statistics/233722/total-wine-consumption-of-the-us-by-wine-type/
#Wine Consumption in the U.S. by Wine Institute
wine_total_us <- read_excel( here::here("data","totalwineconsumption.xlsx"), sheet=2)
#Clean data and rename columns
wine_total_us_clean <- wine_total_us[-1,]
wine_total_us_clean <- wine_total_us_clean[-1,] %>% rename(year="Total wine consumption of the U.S. 2005-2020", consumption="...2") %>% mutate(consumption=as.numeric(consumption))
#Calculate consumption growth from first to last year
pct_growth= round((as.numeric(wine_total_us_clean[16,2])-as.numeric(wine_total_us_clean[1,2]))/as.numeric(wine_total_us_clean[1,2])*100,2)
#Create label for annotation
label <-paste("Total consumption grew by",as.character(pct_growth),"% \n over the last 15 years", by=" ")
wine_total_us_clean %>% ggplot(aes(x=year, y=consumption)) +
geom_col(fill="#5C0303") +
custom_layout +
ylim(0, 1250)+
labs(title="Wine consumption expereinced high growth over the last 15 years",
subtitle = "Total wine consumption in million gallons (2005-2020)",
caption="Source: Wine Institute - Wine consumption survery",
y="Consumption in million gallons") +
#add arrow to graph
geom_curve(
data = data.frame(x = 1, y = 750, xend = 16, yend = 1050),
mapping = aes(x = x, y = y, xend = xend, yend = yend),
colour = "grey15",
size = 0.5,
curvature = -0.25,
arrow = arrow(length = unit(2, "mm"), type = "closed"),
inherit.aes = FALSE ) +
# add the text label on the graph
geom_text(
data = data.frame(x = 6, y = 1200, label = label),
aes(x = x, y = y, label = label),
colour="grey15",
family="PT Sans",
hjust = 0.5,
lineheight = .8,
inherit.aes = FALSE)
Similarly to the GDP analysis, we also want to understand the geographical distribution of wine consumption better. While California has the highest total consumption of wine, the New England region actually has the highest per capita consumption of wine.
library("readxl")
#Source: https://www.statista.com/statistics/942245/wine-consumption-in-the-us-by-state/
#National Institute on Alcohol Abuse and Alcoholism
wine_by_state <- read_excel( here::here("data","wineconsumptionbystate.xlsx"), sheet=2)
#Clean data and rename columns
wine_by_state_clean <- wine_by_state [-1,]
wine_by_state_clean <- wine_by_state_clean[-1,] %>% rename(state="Wine consumption in the U.S. by state 2019", consumption="...2") %>% mutate(consumption=as.numeric(consumption))
wine_by_state_pop <- wine_by_state_clean %>% left_join(state_pop, by=c("state"= "STATE")) %>%
#Consumption data reported in 1,000 of gallons
#Caclulate per capita consumption
mutate(consumption_percapita= consumption/POPESTIMATE2019*1000)
wine_by_state_total_consumption <- wine_by_state_pop %>% select(state, consumption)
wine_by_state_percapita_consumption <- wine_by_state_pop %>% select(state, consumption_percapita)
states_wine_consumption <- states %>% left_join(wine_by_state_total_consumption, by=c("NAME" = "state"))
states_wine_consumption_pc <- states %>% left_join(wine_by_state_percapita_consumption, by=c("NAME" = "state"))
#Total wine consumption
us_wine_map <- ggplot() +
geom_sf(data=states_wine_consumption, aes(fill= consumption)) +
custom_layout +
scale_fill_gradient(low = "#FFDDDD", high = "#5C0303") +
coord_sf(datum = NA)+
labs(title="California has the highest total wine consumption",
subtitle = "Wine consumption in 1,000 gallons",
caption="Source: National Institute on Alcohol Abuse and Alcoholism 2019",
fill="Total wine consumption")
us_wine_map
#Wine consumption per capita
us_wine_map_pc <- ggplot() +
geom_sf(data=states_wine_consumption_pc, aes(fill= consumption_percapita)) +
custom_layout +
scale_fill_gradient(low = "#FFDDDD", high = "#5C0303") +
coord_sf(datum = NA)+
labs(title="New England states have the highest per capita wine consumption",
subtitle = "Wine consumption per capita in gallons",
caption="Source: National Institute on Alcohol Abuse and Alcoholism 2019",
fill="Wine consumption per capita")
us_wine_map_pc
We move on to analyse the data of an retailer to gain more information about what customers spend money on wine and how we could use that data to target specific customer groups.
We take a look at the entrie food basket of the customers of the retailer. We observe that wine actually has the highest proportion of spending, hence we can infer that the retailer is not a regular grocery chain but more likely a specilaized vendor known for its wine.
#COLOR WINE BOXPLOT
my_colours <- c("grey70", "#5C0303")
# Calculate total food basket and proportions of every category
customer_food_basked <- profile_clean %>% select(id,mnt_wines,mnt_fruits,mnt_meat_products,mnt_fish_products, mnt_sweet_products) %>%
mutate(food_basket=mnt_wines+mnt_fruits+mnt_meat_products+mnt_fish_products+mnt_sweet_products,
wine_prop=mnt_wines/food_basket,
fruit_prop=mnt_fruits/food_basket,
meat_prop=mnt_meat_products/food_basket,
fish_prop=mnt_fish_products/food_basket,
sweet_prop=mnt_sweet_products/food_basket)
#Plot proportion distribution
customer_food_basked %>%
select(id, wine_prop, fruit_prop, meat_prop, fish_prop, sweet_prop) %>%
pivot_longer(cols=2:6, names_to="type", values_to="proportion") %>%
mutate(type = fct_reorder(type, proportion, .fun='median')) %>%
mutate(wine=ifelse(type == "wine_prop", TRUE, FALSE)) %>%
ggplot(aes(x=reorder(type, desc(proportion)), y=proportion, group=type, color=wine)) +
geom_boxplot() +
scale_x_discrete(labels= c("Wine", "Meat", "Fish", "Sweets", "Fruits")) +
custom_layout +
scale_color_manual(values = my_colours)+
theme(
axis.title.y = element_blank(),
legend.position = "none" ) +
labs(title="Wine has highest proportion indicating that retailer is a delicatessen or specialized vendor",
subtitle = "Proportion of food basket by food category",
caption="Source: Customer Personality Analysis Data Set",
x= "Food category",
y= "Proportion of food basket")
We know now that the customers of the retailer do appear to purchase a lot on wine, hence we want to use the data to gain more insights on what characteristics the high wine spenders have, which we could then ultimately use to target our customers. From the graphs below we can see that especially income and age appear to impact the median spending on wine consumption. This already gives us a first indication of what target customers we could focus on.
#TO DOS::
#ANNOTATION !!!
#We will use median as the wine spending data appears to be highly skewed right
my_colours <- c("grey70", "#5C0303")
#MARITAL STATUS
profile_clean %>% group_by(marital_status) %>%
summarise(median=median(mnt_wines), count=n()) %>%
filter(marital_status %in% c("Divorced", "Married", "Single","Together", "Widow")) %>%
mutate(wine=ifelse(marital_status == "Widow", TRUE, FALSE)) %>%
ggplot(aes(x=fct_reorder(marital_status, desc(median)), y=median, fill=wine)) +
geom_col() +
custom_layout +
scale_fill_manual(values = my_colours)+
theme(
axis.title.y = element_blank(),
legend.position = "none" ) +
labs(title="Widowed people appear to spend much more on wine",
subtitle = "Median wine spending by marital stuatus",
caption="Source: Customer Personality Analysis Data Set",
x= "Marital status",
y= "Median wine spending")
#EDUCATION
profile_clean %>% group_by(education) %>%
summarise(median=median(mnt_wines), count=n()) %>%
filter(education %in% c("2n Cycle", "Graduation", "Master","PhD")) %>%
mutate(wine=ifelse(education == "PhD", TRUE, FALSE)) %>%
ggplot(aes(x=education, y=median, fill=wine)) +
geom_col() +
custom_layout +
scale_fill_manual(values = my_colours)+
theme(
axis.title.y = element_blank(),
legend.position = "none") +
labs(title="The higher the education, the more people spend on wine",
subtitle = "Median wine spending by education level",
caption="Source: Customer Personality Analysis Data Set",
x= "Education level",
y= "Median wine spending")
#BY CHILDREN AT HOME
profile_clean %>% group_by(kidhome) %>%
summarise(median=median(mnt_wines), count=n()) %>%
ggplot(aes(x=as.factor(kidhome), y=median)) +
geom_col() +
custom_layout +
labs(title="Small children in the household decrease wine spending",
subtitle = "Median wine spending by number of kids",
caption="Source: Customer Personality Analysis Data Set",
x= "Number of kids at home",
y= "Median wine spending")
#BY TEEN AT HOME
profile_clean %>% group_by(teenhome) %>%
summarise(median=median(mnt_wines), count=n()) %>%
ggplot(aes(x=as.factor(teenhome), y=median)) +
geom_col() +
custom_layout +
labs(title="Having teenagers in the house increases wine spending",
subtitle = "Median wine spending by number of teenager",
caption="Source: Customer Personality Analysis Data Set",
x= "Number of teenager at home",
y= "Median wine spending")
#We will add column age and child for further analysis
profile_clean['Age']= 2021-profile_clean$year_birth
profile_clean['Child']=profile_clean$kidhome+profile_clean$teenhome
#BY CHILDREN AT HOME (KIDS+TEENAGERS)
profile_clean %>% group_by(Child) %>%
summarise(median=median(mnt_wines), count=n()) %>%
ggplot(aes(x=as.factor(Child), y=median)) +
geom_col() +
custom_layout +
labs(title="Customers without children at home spend more on wine",
subtitle = "Median wine spending by number of children",
caption="Source: Customer Personality Analysis Data Set",
x= "Number of children at home",
y= "Median wine spending")
#BY AGE (only look at 30-70 because other categories are too small (not enough data))
profile_clean %>% group_by(Age) %>%
filter(Age>=30, Age <70) %>%
mutate(age_group= case_when(
Age %in% c(30,31,32,33,34) ~ "30-34",
Age %in% c(35,36,37,38,39) ~ "35-39",
Age %in% c(40,41,42,43,44) ~ "40-44",
Age %in% c(45,46,47,48,49) ~ "45-49",
Age %in% c(50,51,52,53,54) ~ "50-54",
Age %in% c(55,56,57,58,59) ~ "55-59",
Age %in% c(60,61,62,63,64) ~ "60-64",
Age %in% c(65,66,67,68,69) ~ "65-69",
)) %>%
group_by(age_group) %>%
summarise(median=median(mnt_wines), count=n()) %>%
mutate(wine=ifelse(age_group %in% c("55-59", "60-64", "65-69"), TRUE, FALSE)) %>%
ggplot(aes(x=age_group, y=median, fill=wine)) +
geom_col() +
custom_layout +
scale_fill_manual(values = my_colours)+
theme(
axis.title.y = element_blank(),
legend.position = "none" ) +
labs(title="With age the wine expenditure appears to increase",
subtitle = "Median wine spending by age group",
caption="Source: Customer Personality Analysis Data Set",
x= "Age group",
y= "Median wine spending")
## MEDIAN WINE SPENDING BY INCOME
profile_clean %>% group_by(income) %>%
mutate(income_group= case_when(
income <= 30000 ~ "<30k",
income <= 50000 ~ "30-50k",
income <= 70000 ~ "50-70k",
income <= 90000 ~ "70-90k",
income > 90000 ~ ">90k")) %>%
group_by(income_group) %>%
summarise(median=median(mnt_wines), count=n()) %>%
mutate(income_group = fct_reorder(income_group, median, .fun='median')) %>%
mutate(wine=ifelse(income_group %in% c("70-90k", ">90k"), TRUE, FALSE)) %>%
ggplot(aes(x=reorder(income_group, median), y=median, fill=wine)) +
geom_col() +
custom_layout +
scale_fill_manual(values = my_colours)+
theme(
axis.title.y = element_blank(),
legend.position = "none" ) +
labs(title="High income groups tend to spend more on wine consumption",
subtitle = "Median wine spending by income group",
caption="Source: Customer Personality Analysis Data Set",
x= "Annual income",
y= "Median wine spending")
We observed that age appears to be an influencing factor for wine expenditure. However, we have not explored this variable before. To get a better understanding of the US in terms of median age, we look again at the map of the US by median age. It becomes apparent that the North Eastern region of the US appears to have a higher median age.
library("readxl")
#Source: https://www.statista.com/statistics/208048/median-age-of-population-in-the-usa-by-state/
#US Census Bureau 2019
median_age <- read_excel( here::here("data","medianageus.xlsx"), sheet=2)
#Clean data and rename columns
median_age_clean <- median_age[-1,]
median_age_clean <- median_age_clean[-1,] %>% rename(state="Median age of U.S. population in by state 2019", median_age="...2") %>% mutate(median_age=as.numeric(median_age))
median_age_pop <- median_age_clean %>% left_join(state_pop, by=c("state"= "STATE")) %>%
filter(state!="United States", state!="Puerto Rico")
median_age_pop_temp <- median_age_pop %>% select(state, median_age)
states_median_age <- states %>% left_join(median_age_pop_temp, by=c("NAME" = "state"))
#Median age by state
us_age_map <- ggplot() +
geom_sf(data=states_median_age, aes(fill= median_age)) +
custom_layout +
scale_fill_gradient(low = "#DEFFDA", high = "#064500") +
coord_sf(datum = NA)+
labs(title="Northeast has the highest median age",
subtitle = "Median age by state",
caption="Source: US Census Bureau 2019",
fill="Median age")
us_age_map
library("readr")
profiles_cleaned <- read_csv(here::here("profiles_cleaned.csv"))
## New names:
## * `` -> ...1
## Rows: 440 Columns: 30
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): education, marital_status, dt_customer
## dbl (27): ...1, id, year_birth, income, kidhome, teenhome, recency, mnt_wine...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
profiles_cleaned %>%
mutate(age = 2021-year_birth) %>%
select(age, income, kidhome, mnt_wines, mnt_fish_products, mnt_meat_products) %>%
ggcorr(label_round=2,label=TRUE,hjust=0.75 )
QUESTION: IS THIS JUST WINE CONSUMERS? –> need to identify high spent/high profit customers IS THERE ALSO A WAY TO ONLY INCLUDE SOME OF THE VARIABLES FOR CLUSTREING (AGE; EDUCATION; MARITAL STAUTS; INCOME; CHILDRES) –> only to ones we use in customer analysis –> happy to talk about this
#combining some columns into 1
df1=na.omit(profile_clean)
df1['Age']= 2021-df1$year_birth
df1['Child']=df1$kidhome+df1$teenhome
df1['total_spent']=df1$mnt_meat_products+df1$mnt_fish_products+df1$mnt_wines+df1$mnt_fruits+df1$mnt_sweet_products+df1$mnt_gold_prods
df1['accepted']=df1$accepted_cmp1+df1$accepted_cmp2+df1$accepted_cmp3+df1$accepted_cmp4+df1$accepted_cmp5
head(df1)
| id | year_birth | education | marital_status | income | kidhome | teenhome | dt_customer | recency | mnt_wines | mnt_fruits | mnt_meat_products | mnt_fish_products | mnt_sweet_products | mnt_gold_prods | num_deals_purchases | num_web_purchases | num_catalog_purchases | num_store_purchases | num_web_visits_month | accepted_cmp3 | accepted_cmp4 | accepted_cmp5 | accepted_cmp1 | accepted_cmp2 | complain | z_cost_contact | z_revenue | response | Age | Child | total_spent | accepted |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5.52e+03 | 1.96e+03 | Graduation | Single | 5.81e+04 | 0 | 0 | 04-09-2012 | 58 | 635 | 88 | 546 | 172 | 88 | 88 | 3 | 8 | 10 | 4 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 1 | 64 | 0 | 1.62e+03 | 0 |
| 2.17e+03 | 1.95e+03 | Graduation | Single | 4.63e+04 | 1 | 1 | 08-03-2014 | 38 | 11 | 1 | 6 | 2 | 1 | 6 | 2 | 1 | 1 | 2 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 67 | 2 | 27 | 0 |
| 4.14e+03 | 1.96e+03 | Graduation | Together | 7.16e+04 | 0 | 0 | 21-08-2013 | 26 | 426 | 49 | 127 | 111 | 21 | 42 | 1 | 8 | 2 | 10 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 56 | 0 | 776 | 0 |
| 6.18e+03 | 1.98e+03 | Graduation | Together | 2.66e+04 | 1 | 0 | 10-02-2014 | 26 | 11 | 4 | 20 | 10 | 3 | 5 | 2 | 2 | 0 | 4 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 37 | 1 | 53 | 0 |
| 5.32e+03 | 1.98e+03 | PhD | Married | 5.83e+04 | 1 | 0 | 19-01-2014 | 94 | 173 | 43 | 118 | 46 | 27 | 15 | 5 | 5 | 3 | 6 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 40 | 1 | 422 | 0 |
| 7.45e+03 | 1.97e+03 | Master | Together | 6.25e+04 | 0 | 1 | 09-09-2013 | 16 | 520 | 42 | 98 | 0 | 42 | 14 | 2 | 6 | 4 | 10 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 11 | 0 | 54 | 1 | 716 | 0 |
#getting rid of other columns
df1=df1[c(-1,-2,-6,-7,-8,-10,-11,-12,-13,-14,-15,-21,-22,-23,-24,-25,-27,-28)]
library(caret)
dmy <- dummyVars(" ~ .", data = df1, fullRank = T)
dat_transformed <- data.frame(predict(dmy, newdata = df1))
#glimpse(dat_transformed)
dfc=dat_transformed[c(7,8,13,14,15,16,17,18,19,20,21,22,23,24)]
library(factoextra)
fviz_nbclust(dfc,kmeans,method="wss")+geom_vline(xintercept=2,linetype=2) + custom_layout
set.seed(123)
km.res <- kmeans(dfc,2, nstart = 10)
fviz_cluster(km.res, dfc, geom = "point",ellipse.type = "norm",repel = TRUE, ggtheme = theme_minimal(), xlab="", ylab="")+
ggtitle(label="We identified 2 main consumer clusters")+
scale_color_manual(values=c("tomato", "cornflowerblue"))+
theme(plot.title = element_text(face = "bold")) + custom_layout
df1<-df1 %>%
select(-education, -marital_status)
df1 <- data.frame(scale(df1))
model_kmeans_2clusters <- eclust(df1, "kmeans", k = 2, nstart=50, graph=FALSE)
#add clusters to the data frame
dfc_withClusters <- mutate(df1, cluster=as.factor(model_kmeans_2clusters$cluster))
cluster_centers <- data.frame(cluster=as.factor(c(1:2)),model_kmeans_2clusters$centers)
#transpose
cluster_centers_t <- cluster_centers %>% gather(variable, value, -cluster, factor_key=TRUE)
#plot the clusters
graphkmeans_2clusters <- ggplot(cluster_centers_t, aes(x=variable, y=value))+
geom_line(aes(color=cluster, group=cluster), linetype="dashed", size=1)+
geom_point(size=1, shape=4)+
geom_hline(yintercept=0)+
custom_layout+
theme(text = element_text(size=10),
axis.text.x = element_text(angle=45, hjust=1),
plot.title = element_text(face = "bold"))+
ggtitle("Consumers are separated on income, amount spent and \nnumber of children", subtitle="Separation of 2 clusters based on different consumer features") +
scale_color_manual(values=c("tomato", "cornflowerblue"))+
labs(y="", x="")
graphkmeans_2clusters